####**********************************************
#### FINDING AND SAVING SUMMARY STATISTICS *******
####**********************************************

week_id <- read.csv("data/official_statistics/Week_Year_Key.csv")
emp_demos <- readRDS("data/derived/emp_demos.Rds")
client_demos <- readRDS("data/derived/client_demos.Rds")

## creating statistics tables
stat_firm <- tibble()
stat_industry <- tibble()
stat_firm_size <- tibble() ## firm size dist. in Q1 of 2023
stat_firm_size_23 <- tibble() ## firm size dist. in 2023
stat_worker <- tibble()
stat_hourly <- tibble() 
stat_age <- tibble()
stat_gender <- tibble()
stat_category <- tibble() ## full-time/part-time status
## based on the ee_category field in the data
stat_category2 <- tibble() ## full-time/part-time status
## based on the worker's weekly working hours (<35 part-time, >=35 full-time)
stat_ethnicity <- tibble()
stat_wage <- tibble()
stat_worker_industry <- tibble()
stat_region <- tibble()

# loop over 220 subsamples of the micro data and calculate summary stats
# for each one of the samples. Then, in 2-statistics_output.R we will use
# these stats by sample-year and aggregate them to create stats by year.

ptm <- proc.time()
for (i in 1:220){
  ## reading the wage micro data sample and merging with firms micro data
  wage_samp <- readRDS(paste0("data/micro/wage_50_samp_", i, ".rds")) %>%
    left_join(week_id, by = "week") %>%
    filter(year >= 2010 & year <= 2023) %>%
    inner_join(client_demos, by = "clt_client") %>%
    distinct(year, person_job_id, .keep_all = T) %>%
    select(person_job_id, clt_client, year, quarter, clt_region,
           clt_state, clt_industry_code_2d,
           rate_reg, total_pay, worker_type, hours_reg, total_hours)
  
  ## number of firms per year
  stat_temp <-  wage_samp %>% group_by(year) %>%
    summarise(n_firm = n_distinct(clt_client)) %>% mutate(sample = i)
  stat_firm <- bind_rows(stat_firm, stat_temp)
  
  ## number of workers per industry and year
  stat_temp <-  wage_samp %>% group_by(year, clt_industry_code_2d) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_industry <- bind_rows(stat_industry, stat_temp)
  
  # firms' size in the first quarter of 2023
  stat_temp <-  wage_samp %>% filter(year == 2023 & quarter == 1) %>%
    group_by(clt_client) %>% summarise(size = n()) %>% mutate(sample = i)
  stat_firm_size <- bind_rows(stat_firm_size, stat_temp)
  
  ## firms' size in 2023
  stat_temp <-  wage_samp %>% filter(year == 2023) %>%
    group_by(clt_client) %>% summarise(size = n()) %>% mutate(sample = i)
  stat_firm_size_23 <- bind_rows(stat_firm_size_23, stat_temp)
  
  ## number of workers per year
  stat_temp <- wage_samp %>% group_by(year) %>% summarise(n_job = n()) %>%
    mutate(sample = i)
  stat_worker <- bind_rows(stat_worker, stat_temp)
  
  ## number of hourly and salaried workers per year
  stat_temp <- wage_samp %>% group_by(year, worker_type) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_hourly <- bind_rows(stat_hourly, stat_temp)
  
  ### hourly workers' characteristics
  
  ## restricting the sample to hourly workers with valid wage data
  wage_samp <- wage_samp %>% filter(year >= 2014 & year <= 2023 &
                                      worker_type %in% c("hourly", "tipped") &
                                      rate_reg > 0 & !is.na(rate_reg) &
                                      !is.na(rate_reg) &
                                      hours_reg >= 1 & !is.na(hours_reg))
  # !is.na(clt_state) & !is.na(clt_industry_code_2d) &
  # clt_industry_code_2d %in% industries$industry_code)
  
  stat_temp <- wage_samp %>% group_by(rate_reg) %>%
    mutate(rate_reg_topcode = min(rate_reg, 1000),
           rate_reg_topcode_CPS = ifelse(hours_reg < 30,
                                         min(rate_reg, 100),
                                         min(rate_reg, 2885/hours_reg))) %>%
    ungroup() %>%
    summarise(mean_wage = mean(rate_reg_topcode),
              mean_wage_cps_topcode = mean(rate_reg_topcode_CPS), n = n()) %>%
    mutate(sample = i)
  stat_wage <- bind_rows(stat_wage, stat_temp)
  
  ## merging the wage micro data with employees' data
  wage_samp <- wage_samp %>% left_join(emp_demos, by = "person_job_id")
  
  stat_temp <- wage_samp %>% group_by(ee_gender) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_gender <- bind_rows(stat_gender, stat_temp)
  
  stat_temp <- wage_samp %>% group_by(ee_category) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_category <- bind_rows(stat_category, stat_temp)
  
  
  stat_temp <- wage_samp %>%
    mutate(ee_category = case_when(!is.na(ee_category) ~ ee_category,
                                   !is.na(total_hours) ~ ifelse(total_hours < 35,
                                                                "Part Time",
                                                                "Full Time"),
                                   T ~ "NA")) %>%
    group_by(ee_category) %>% summarise(n = n()) %>% mutate(sample = i)
  stat_category2 <- bind_rows(stat_category2, stat_temp)
  
  stat_temp <- wage_samp %>% group_by(ee_ethnicity) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_ethnicity <- bind_rows(stat_ethnicity, stat_temp)
  
  #TODO: improve definition of age
  stat_temp <- wage_samp %>% mutate(age = year - ee_birthyear) %>%
    group_by(age) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_age <- bind_rows(stat_age, stat_temp)
  
  stat_temp <- wage_samp %>% group_by(clt_industry_code_2d) %>% summarise(n = n()) %>%
    mutate(sample = i)
  stat_worker_industry <- bind_rows(stat_worker_industry, stat_temp)
  
  stat_temp <- wage_samp %>% group_by(clt_region) %>% summarise(n = n())  %>%
    mutate(sample = i)
  stat_region <- bind_rows(stat_region, stat_temp)
  
  rm(wage_samp)
  print(i)
}
ptm <- proc.time() - ptm

#### saving the statistics #######################

write_rds(stat_firm, paste0(project_dir,"/data/tmp_data/stat_firm.rds"))

write_rds(stat_industry, paste0(project_dir,"/data/tmp_data/stat_industry.rds"))

stat_firm_size2 <- stat_firm_size %>%
  group_by(size, sample) %>% summarise(n = n())
write_rds(stat_firm_size2, paste0(project_dir,"/data/tmp_data/stat_firm_size.rds"))

stat_firm_size_23 <- stat_firm_size_23 %>% group_by(size, sample) %>%
  summarise(n = n())
write_rds(stat_firm_size_23, paste0(project_dir,"/data/tmp_data/stat_firm_size_23.rds"))

write_rds(stat_worker, paste0(project_dir,"/data/tmp_data/stat_worker.rds"))

write_rds(stat_hourly, paste0(project_dir,"/data/tmp_data/stat_hourly.rds"))

write_rds(stat_gender, paste0(project_dir,"/data/tmp_data/stat_gender.rds"))

write_rds(stat_age, paste0(project_dir,"/data/tmp_data/stat_age.rds"))

write_rds(stat_category, paste0(project_dir,"/data/tmp_data/stat_category.rds"))

write_rds(stat_category2, paste0(project_dir,"/data/tmp_data/stat_category2.rds"))

write_rds(stat_ethnicity, paste0(project_dir,"/data/tmp_data/stat_ethnicity.rds"))

write_rds(stat_worker_industry, paste0(project_dir,"/data/tmp_data/stat_worker_industry.rds"))

write_rds(stat_region, paste0(project_dir,"/data/tmp_data/stat_region.rds"))

write_rds(stat_wage, paste0(project_dir,"/data/tmp_data/stat_wage.rds"))
##########################################################


rm(list=ls()) 
gc() #free up memory used by R
#########################################################

